Set up our necessary packages. Uncomment the install.packages line the first time you run through this. Set your working directory to be the src root of this project. You’ll need to change the setwd() path before you get started.
# install.packages(c("plyr", "readr", "ggplot2", "dplyr", "fitdistrplus", "anytime", "data.table", "knitr", "tinytex"))
library(plyr)
library(readr)
library(ggplot2)
library(dplyr)
library(fitdistrplus)
library(anytime)
library(data.table)
library(knitr)
# NOTE: machine-specific absolute path. Change it to the src root of your own
# checkout before running; the relative "./tokenPrices" and "./edgeFiles"
# paths used below are resolved against this directory.
setwd('/Users/daytonpe/Dropbox/utd/6316_stat_methods_for_ds_akcora/project/src')
Load in all of our data. The modulo of the sum of our UTD IDs was 2, so we will be using Tronix, Omisego, and YoCoin for our analysis.
# Price files ----
# All three price files are read the same way: the first line of the file is
# skipped and the columns are named explicitly.
price_columns <- c('Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'MarketCap')

# Read one price file into a data.frame with the shared column names.
read_price_file <- function(path) {
  read.table(path, header = FALSE, skip = 1, col.names = price_columns)
}

omg_price_df <- read_price_file("./tokenPrices/omisego.txt")
trn_price_df <- read_price_file("./tokenPrices/tron")
yoc_price_df <- read_price_file("./tokenPrices/yocoin")
# Edge files ----
# The edge files are space-delimited and have no header row, so the columns
# are named explicitly after reading.
edge_columns <- c('fromID', 'toID', 'unixTime', 'tokenAmount')

# `FALSE` is spelled out: `F` is an ordinary variable that can be reassigned,
# which would silently turn the first data row into column names.
omg_edge_df <- read_delim('./edgeFiles/omisego.txt', delim = " ", col_names = FALSE)
trn_edge_df <- read_delim('./edgeFiles/tron.txt', delim = " ", col_names = FALSE)
yoc_edge_df <- read_delim('./edgeFiles/yo.txt', delim = " ", col_names = FALSE)

# Label the four columns of every edge table.
names(omg_edge_df) <- edge_columns
names(trn_edge_df) <- edge_columns
names(yoc_edge_df) <- edge_columns
Check for duplicated values in all of our files and remove them.
# Duplicate check for the OMG tables.
# NOTE: anyDuplicated() returns the index of the first duplicated row (0 when
# every row is unique); it is not a count of duplicated rows.
cat("omg_price_df duplicates: ", anyDuplicated(omg_price_df), " \n")
## omg_price_df duplicates: 0
# Inspect the edge table itself here; checking the price table a second time
# would always mirror the line above and hide duplicates in the edges.
cat("omg_edge_df duplicates: ", anyDuplicated(omg_edge_df), " \n")
## (re-knit to record this value; the earlier "0" was computed from omg_price_df)
# Keep only the unique rows of each table.
omg_price_df <- omg_price_df %>% distinct()
omg_edge_df <- omg_edge_df %>% distinct()
cat("omg_edge_df duplicates: ", anyDuplicated(omg_edge_df), " \n") # after duplicates removed
## omg_edge_df duplicates: 0
cat("omg_price_df duplicates: ", anyDuplicated(omg_price_df), " \n") # after duplicates removed
## omg_price_df duplicates: 0
# Duplicate check for the TRN tables. anyDuplicated() yields the index of the
# first duplicated row, or 0 when every row is unique (it is not a count).
cat("trn_price_df duplicates: ", anyDuplicated(trn_price_df), " \n")
## trn_price_df duplicates: 0
cat("trn_edge_df duplicates: ", anyDuplicated(trn_edge_df), " \n")
## trn_edge_df duplicates: 1536
# Keep only the unique rows of each table.
trn_price_df <- distinct(trn_price_df)
trn_edge_df <- distinct(trn_edge_df)
# Re-check after cleaning.
cat("trn_price_df duplicates: ", anyDuplicated(trn_price_df), " \n") # after duplicates removed
## trn_price_df duplicates: 0
cat("trn_edge_df duplicates: ", anyDuplicated(trn_edge_df), " \n") # after duplicates removed
## trn_edge_df duplicates: 0
# Duplicate check for the YOC tables. anyDuplicated() yields the index of the
# first duplicated row, or 0 when every row is unique (it is not a count).
cat("yoc_price_df duplicates: ", anyDuplicated(yoc_price_df), " \n")
## yoc_price_df duplicates: 0
cat("yoc_edge_df duplicates: ", anyDuplicated(yoc_edge_df), " \n")
## yoc_edge_df duplicates: 992
# Keep only the unique rows of each table.
yoc_price_df <- distinct(yoc_price_df)
yoc_edge_df <- distinct(yoc_edge_df)
# Re-check after cleaning.
cat("yoc_price_df duplicates: ", anyDuplicated(yoc_price_df), " \n") # after duplicates removed
## yoc_price_df duplicates: 0
cat("yoc_edge_df duplicates: ", anyDuplicated(yoc_edge_df), " \n") # after duplicates removed
## yoc_edge_df duplicates: 0
Convert the date to the correct format in the price data frames.
# Convert the Date column of each price table into Date objects.
# NOTE(review): OMG is parsed with a four-digit year (%Y) while TRN and YOC use
# a two-digit year (%y) — presumably this mirrors each file; confirm against
# the raw data.
omg_price_df[["Date"]] <- as.Date(omg_price_df[["Date"]], format = '%m/%d/%Y')
trn_price_df[["Date"]] <- as.Date(trn_price_df[["Date"]], format = '%m/%d/%y')
yoc_price_df[["Date"]] <- as.Date(yoc_price_df[["Date"]], format = '%m/%d/%y')
Set our constants for each coin, then remove edge file rows where token amount is too big to make sense. Note: as the row counts below show, all three tokens had a small number of records removed (11 for OMG, 82 for TRN, and 90 for YOC).
# Per-token constants. For each token, decimals * supply is the upper bound
# that tokenAmount is compared against when filtering the edge tables.
omg_decimals <- 10^18
omg_supply <- 140245398

trn_decimals <- 10^6
trn_supply <- 66682072191

yoc_decimals <- 10^16
yoc_supply <- 369659255
# Drop OMG transfers whose tokenAmount exceeds omg_decimals * omg_supply.
# The bound is evaluated once and the same filtered table is both reported on
# and kept, so the printed counts describe exactly the rows that survive
# (the report used `<` while the kept data used `<=`).
omg_max_amount <- omg_decimals * omg_supply
omg_edge_df_filtered <- omg_edge_df %>% filter(tokenAmount <= omg_max_amount)
cat("Num Rows before Filtering: ", nrow(omg_edge_df), "\n")
## Num Rows before Filtering: 1143029
cat("Num Rows after Filtering: ", nrow(omg_edge_df_filtered), "\n")
## Num Rows after Filtering: 1143018
cat("Num Rows cut: ", (nrow(omg_edge_df)-nrow(omg_edge_df_filtered)), "\n")
## Num Rows cut: 11
omg_edge_df <- omg_edge_df_filtered
# Drop TRN transfers whose tokenAmount exceeds trn_decimals * trn_supply.
# The bound is evaluated once and the same filtered table is both reported on
# and kept, so the printed counts describe exactly the rows that survive
# (the report used `<` while the kept data used `<=`).
trn_max_amount <- trn_decimals * trn_supply
tron_edge_df_filtered <- trn_edge_df %>% filter(tokenAmount <= trn_max_amount)
cat("Num Rows before Filtering: ", nrow(trn_edge_df), "\n")
## Num Rows before Filtering: 1512662
cat("Num Rows after Filtering: ", nrow(tron_edge_df_filtered), "\n")
## Num Rows after Filtering: 1512580
cat("Num Rows cut: ", (nrow(trn_edge_df)-nrow(tron_edge_df_filtered)), "\n")
## Num Rows cut: 82
trn_edge_df <- tron_edge_df_filtered
# Drop YOC transfers whose tokenAmount exceeds yoc_decimals * yoc_supply.
# The bound is evaluated once and the same filtered table is both reported on
# and kept, so the printed counts describe exactly the rows that survive
# (the report used `<` while the kept data used `<=`). The column is
# referenced by bare name rather than `yoc_edge_df$tokenAmount`, which is the
# form filter() expects.
yoc_max_amount <- yoc_decimals * yoc_supply
yocoin_edge_df_filtered <- yoc_edge_df %>% filter(tokenAmount <= yoc_max_amount)
cat("Num Rows before Filtering: ", nrow(yoc_edge_df), "\n")
## Num Rows before Filtering: 595582
cat("Num Rows after Filtering: ", nrow(yocoin_edge_df_filtered), "\n")
## Num Rows after Filtering: 595492
cat("Num Rows cut: ", (nrow(yoc_edge_df)-nrow(yocoin_edge_df_filtered)), "\n")
## Num Rows cut: 90
yoc_edge_df <- yocoin_edge_df_filtered
Update the edge data frame dates to be the correct format.
# Add a Date column to each edge table, derived from unixTime via anydate().
# NOTE(review): the resulting day may depend on the session time zone — confirm
# it lines up with the dates used in the price files.
omg_edge_df[["Date"]] <- anydate(omg_edge_df[["unixTime"]])
trn_edge_df[["Date"]] <- anydate(trn_edge_df[["unixTime"]])
yoc_edge_df[["Date"]] <- anydate(yoc_edge_df[["unixTime"]])
Determine some extra features on which we can create our multiple linear regressions.
Calculate number of buys and sells by user_id Great description here: https://stackoverflow.com/questions/25869378/what-does-n-n-mean-in-r
# Number of transfers received per toID ("buys") and sent per fromID ("sells").
# dplyr::count() is namespaced explicitly because plyr, which is also attached,
# exports a different count(). The result has the id column plus a count `n`.
omg_buys <- omg_edge_df %>% dplyr::count(toID)
trn_buys <- trn_edge_df %>% dplyr::count(toID)
yoc_buys <- yoc_edge_df %>% dplyr::count(toID)

omg_sells <- omg_edge_df %>% dplyr::count(fromID)
trn_sells <- trn_edge_df %>% dplyr::count(fromID)
yoc_sells <- yoc_edge_df %>% dplyr::count(fromID)
Filter to only include top K buyers and build a dataframe with the summarized data for fitting a regression model. Features we create here include: - Avg_Tok_Amt: Average Token Amount traded for the top-K users on the given day - Tot_Tok_Amt: Total Token Amount traded by the top-K users on the given day - Transactions: Number of transactions by the top-K users on the given day - Distinct Buyers: Distinct number of buyers for a given day - Distinct Sellers: Distinct number of sellers for a given day
# Number of top buyers (ranked by transfer count) retained for each token.
K_omg <- 104
K_trn <- 18000
K_yoc <- 136

# Keep the K buyers with the highest transfer counts.
omg_buys <- head(arrange(omg_buys, desc(n)), K_omg)
trn_buys <- head(arrange(trn_buys, desc(n)), K_trn)
yoc_buys <- head(arrange(yoc_buys, desc(n)), K_yoc)

# Restrict the edge tables to transfers received by one of those buyers.
omg_top_k_buys <- omg_edge_df %>% filter(toID %in% omg_buys$toID)
trn_top_k_buys <- trn_edge_df %>% filter(toID %in% trn_buys$toID)
yoc_top_k_buys <- yoc_edge_df %>% filter(toID %in% yoc_buys$toID)
# Build the per-day feature tables used to fit the regression models.

# Collapse an edge table to one row per Date with the engineered features:
# mean and total tokenAmount, number of transfers, and the number of distinct
# receiving (toID) and sending (fromID) ids on that day.
summarise_by_day <- function(edges) {
  edges %>%
    group_by(Date) %>%
    summarise(
      Avg_Tok_Amt = mean(tokenAmount),
      Tot_Tok_Amt = sum(tokenAmount),
      Transactions = n(),
      Distinct_Buyers = n_distinct(toID),
      Distinct_Sellers = n_distinct(fromID)
    ) %>%
    ungroup()
}

omg_fit_data <- summarise_by_day(omg_top_k_buys)
trn_fit_data <- summarise_by_day(trn_top_k_buys)
yoc_fit_data <- summarise_by_day(yoc_top_k_buys)
Join edge data to pricing data based on the Date. We lose a small percentage of the data here due to the fact that the timeframes for the two data files do not match perfectly.
# Attach the price columns to the daily features. With merge()'s defaults only
# dates present in both tables are kept, and the result is sorted by Date.
omg_fit_data <- merge(x = omg_fit_data, y = omg_price_df, by = "Date")
trn_fit_data <- merge(x = trn_fit_data, y = trn_price_df, by = "Date")
yoc_fit_data <- merge(x = yoc_fit_data, y = yoc_price_df, by = "Date")
Calculate the close values of the previous 3 days. Note: m1 refers to minus 1, i.e. one day previous
# Close price of the previous 1, 2 and 3 calendar days (m1 = minus one day).
#
# The lag is looked up by date in the price table instead of shifting rows of
# the fit table: a row shift returns the previous *row*, which is a different
# day whenever a date is missing from the fit table (no top-K activity, or no
# matching price row). The lookup also does not depend on row order.

# Return, for every element of `dates`, the Close recorded in `price_df`
# exactly `days_back` days earlier, or NA when the price table has no row for
# that day.
close_days_before <- function(dates, price_df, days_back) {
  wanted <- as.numeric(dates) - days_back
  price_df$Close[match(wanted, as.numeric(price_df$Date))]
}

omg_fit_data$Close_m1 <- close_days_before(omg_fit_data$Date, omg_price_df, 1)
omg_fit_data$Close_m2 <- close_days_before(omg_fit_data$Date, omg_price_df, 2)
omg_fit_data$Close_m3 <- close_days_before(omg_fit_data$Date, omg_price_df, 3)

trn_fit_data$Close_m1 <- close_days_before(trn_fit_data$Date, trn_price_df, 1)
trn_fit_data$Close_m2 <- close_days_before(trn_fit_data$Date, trn_price_df, 2)
trn_fit_data$Close_m3 <- close_days_before(trn_fit_data$Date, trn_price_df, 3)

yoc_fit_data$Close_m1 <- close_days_before(yoc_fit_data$Date, yoc_price_df, 1)
yoc_fit_data$Close_m2 <- close_days_before(yoc_fit_data$Date, yoc_price_df, 2)
yoc_fit_data$Close_m3 <- close_days_before(yoc_fit_data$Date, yoc_price_df, 3)
Let’s take a look at our data with our newly engineered features on which we will fit our multiple regression model.
# Auto-print the three per-day modelling tables for inspection.
omg_fit_data
trn_fit_data
yoc_fit_data
Let’s also take a look at how many days are tracked for the three tokens in our data sets. We have the most data for YOC.
# Number of rows (one per day) available for each token's model.
cat("OMG Rows: ", NROW(omg_fit_data), "\n")
## OMG Rows: 297
cat("TRX Rows: ", NROW(trn_fit_data), "\n")
## TRX Rows: 236
cat("YOC Rows: ", NROW(yoc_fit_data), "\n")
## YOC Rows: 422
We chose to regress to the Close value of the token, so we will compare the correlation of each of the regressors (Xs) to the Close (Y).
We can make the observation from this data that the previous day’s prices are far more correlated to the Close price on the day when compared to the token amounts, distinct buyers, and other features we engineered. This is expected.
# OMG: correlation of each candidate regressor with Close. The lagged closes
# can contain NA, so those use only the complete pairs.
cat("Transactions: ", with(omg_fit_data, cor(Close, Transactions)), "\n")
## Transactions: 0.3133622
cat("Total Token Amount: ", with(omg_fit_data, cor(Close, Tot_Tok_Amt)), "\n")
## Total Token Amount: -0.2566245
cat("Average Token Amount:", with(omg_fit_data, cor(Close, Avg_Tok_Amt)), "\n")
## Average Token Amount: -0.4220751
cat("Distinct Buyers: ", with(omg_fit_data, cor(Close, Distinct_Buyers)), "\n")
## Distinct Buyers: 0.6134334
cat("Distinct Sellers: ", with(omg_fit_data, cor(Close, Distinct_Sellers)), "\n")
## Distinct Sellers: 0.2075846
cat("Close Minus 1: ", with(omg_fit_data, cor(Close, Close_m1, use = "complete.obs")), "\n")
## Close Minus 1: 0.9786079
cat("Close Minus 2: ", with(omg_fit_data, cor(Close, Close_m2, use = "complete.obs")), "\n")
## Close Minus 2: 0.9591295
cat("Close Minus 3: ", with(omg_fit_data, cor(Close, Close_m3, use = "complete.obs")), "\n")
## Close Minus 3: 0.9387131
# TRN: correlation of each candidate regressor with Close. The lagged closes
# can contain NA, so those use only the complete pairs.
cat("Transactions: ", with(trn_fit_data, cor(Close, Transactions)), "\n")
## Transactions: 0.5137099
cat("Total Token Amount: ", with(trn_fit_data, cor(Close, Tot_Tok_Amt)), "\n")
## Total Token Amount: 0.194449
cat("Average Token Amount:", with(trn_fit_data, cor(Close, Avg_Tok_Amt)), "\n")
## Average Token Amount: -0.1125508
cat("Distinct Buyers: ", with(trn_fit_data, cor(Close, Distinct_Buyers)), "\n")
## Distinct Buyers: 0.8720125
cat("Distinct Sellers: ", with(trn_fit_data, cor(Close, Distinct_Sellers)), "\n")
## Distinct Sellers: 0.243085
cat("Close Minus 1: ", with(trn_fit_data, cor(Close, Close_m1, use = "complete.obs")), "\n")
## Close Minus 1: 0.9615547
cat("Close Minus 2: ", with(trn_fit_data, cor(Close, Close_m2, use = "complete.obs")), "\n")
## Close Minus 2: 0.9165609
cat("Close Minus 3: ", with(trn_fit_data, cor(Close, Close_m3, use = "complete.obs")), "\n")
## Close Minus 3: 0.86749
# YOC: correlation of each candidate regressor with Close. The lagged closes
# can contain NA, so those use only the complete pairs.
cat("Transactions: ", with(yoc_fit_data, cor(Close, Transactions)), "\n")
## Transactions: -0.03078097
cat("Total Token Amount: ", with(yoc_fit_data, cor(Close, Tot_Tok_Amt)), "\n")
## Total Token Amount: -0.2928949
cat("Average Token Amount:", with(yoc_fit_data, cor(Close, Avg_Tok_Amt)), "\n")
## Average Token Amount: -0.2878166
cat("Distinct Buyers: ", with(yoc_fit_data, cor(Close, Distinct_Buyers)), "\n")
## Distinct Buyers: 0.3815441
cat("Distinct Sellers: ", with(yoc_fit_data, cor(Close, Distinct_Sellers)), "\n")
## Distinct Sellers: 0.05851667
cat("Close Minus 1: ", with(yoc_fit_data, cor(Close, Close_m1, use = "complete.obs")), "\n")
## Close Minus 1: 0.9806905
cat("Close Minus 2: ", with(yoc_fit_data, cor(Close, Close_m2, use = "complete.obs")), "\n")
## Close Minus 2: 0.9751215
cat("Close Minus 3: ", with(yoc_fit_data, cor(Close, Close_m3, use = "complete.obs")), "\n")
## Close Minus 3: 0.9702466
Time to actually perform the fit via multiple linear regression. We will split each coin into two different models. The first considering the previous 3 days of Close prices. The second only focusing on the features we engineered. As the previous three days were so highly correlated with the price, they make the R^2 value significantly higher and we lose some understanding of which one of the engineered features actually contributes the most.
Note: The models ending in “_hist" take the price history for the three previous days into account. Those with “_no_hist" endings do not.
# OMG models. The formulas are written inline (not stored in a variable) so
# that the `Call:` shown by summary() spells out every term.

# With price history: engineered features plus the three lagged closes.
omg_fit_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers +
    Close_m1 + Close_m2 + Close_m3,
  data = omg_fit_data
)

# Without price history: engineered features only.
omg_fit_no_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers,
  data = omg_fit_data
)
# TRN models. The formulas are written inline (not stored in a variable) so
# that the `Call:` shown by summary() spells out every term.

# With price history: engineered features plus the three lagged closes.
trn_fit_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers +
    Close_m1 + Close_m2 + Close_m3,
  data = trn_fit_data
)

# Without price history: engineered features only.
trn_fit_no_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers,
  data = trn_fit_data
)
# YOC models. The formulas are written inline (not stored in a variable) so
# that the `Call:` shown by summary() spells out every term.

# With price history: engineered features plus the three lagged closes.
yoc_fit_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers +
    Close_m1 + Close_m2 + Close_m3,
  data = yoc_fit_data
)

# Without price history: engineered features only.
yoc_fit_no_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers,
  data = yoc_fit_data
)
# Coefficients and fit statistics for the OMG model with price history.
omg_fit_hist %>% summary() %>% print()
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers + Close_m1 + Close_m2 +
## Close_m3, data = omg_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8643 -0.4799 -0.0258 0.5055 4.9258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.347e-02 2.614e-01 -0.281 0.7788
## Avg_Tok_Amt -6.106e-23 5.994e-23 -1.019 0.3092
## Tot_Tok_Amt 1.926e-25 8.781e-26 2.193 0.0291 *
## Transactions 1.266e-04 8.702e-05 1.455 0.1467
## Distinct_Buyers 1.225e-02 7.139e-03 1.716 0.0872 .
## Distinct_Sellers 7.825e-05 1.405e-04 0.557 0.5780
## Close_m1 9.173e-01 5.791e-02 15.842 <2e-16 ***
## Close_m2 3.376e-02 7.878e-02 0.429 0.6686
## Close_m3 -2.060e-03 5.695e-02 -0.036 0.9712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.056 on 285 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.96, Adjusted R-squared: 0.9589
## F-statistic: 855 on 8 and 285 DF, p-value: < 2.2e-16
# Diagnostic plots for the OMG model with price history.
plot(x = omg_fit_hist)
# Coefficients and fit statistics for the OMG model without price history.
omg_fit_no_hist %>% summary() %>% print()
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers, data = omg_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.2808 -3.2030 -0.2738 2.4096 11.4253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.341e+00 8.543e-01 3.911 0.000115 ***
## Avg_Tok_Amt -2.385e-22 1.331e-22 -1.791 0.074285 .
## Tot_Tok_Amt -1.296e-25 2.897e-25 -0.447 0.654941
## Transactions 1.448e-03 3.159e-04 4.585 6.75e-06 ***
## Distinct_Buyers 2.335e-01 2.283e-02 10.227 < 2e-16 ***
## Distinct_Sellers -1.334e-03 5.184e-04 -2.573 0.010573 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.99 on 291 degrees of freedom
## Multiple R-squared: 0.4418, Adjusted R-squared: 0.4322
## F-statistic: 46.06 on 5 and 291 DF, p-value: < 2.2e-16
# Diagnostic plots for the OMG model without price history.
plot(x = omg_fit_no_hist)
# Coefficients and fit statistics for the TRN model with price history.
trn_fit_hist %>% summary() %>% print()
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers + Close_m1 + Close_m2 +
## Close_m3, data = trn_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.030480 -0.002582 0.000472 0.001864 0.066202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.382e-03 8.301e-04 -1.665 0.09741 .
## Avg_Tok_Amt 1.331e-18 3.627e-17 0.037 0.97075
## Tot_Tok_Amt 1.182e-19 1.597e-19 0.740 0.45994
## Transactions -3.004e-06 1.580e-06 -1.901 0.05861 .
## Distinct_Buyers 3.328e-05 5.320e-06 6.256 1.98e-09 ***
## Distinct_Sellers 2.639e-06 1.730e-06 1.526 0.12846
## Close_m1 8.077e-01 5.785e-02 13.962 < 2e-16 ***
## Close_m2 4.101e-02 7.806e-02 0.525 0.59987
## Close_m3 -1.467e-01 5.462e-02 -2.685 0.00779 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008241 on 224 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.9525, Adjusted R-squared: 0.9508
## F-statistic: 561.2 on 8 and 224 DF, p-value: < 2.2e-16
# Diagnostic plots for the TRN model with price history.
plot(x = trn_fit_hist)
# Coefficients and fit statistics for the TRN model without price history.
trn_fit_no_hist %>% summary() %>% print()
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers, data = trn_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.080304 -0.007163 -0.001766 0.003777 0.068697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.037e-03 1.617e-03 -0.641 0.522
## Avg_Tok_Amt 1.178e-17 7.153e-17 0.165 0.869
## Tot_Tok_Amt -4.220e-19 3.077e-19 -1.372 0.172
## Transactions -1.989e-05 2.809e-06 -7.081 1.72e-11 ***
## Distinct_Buyers 1.264e-04 7.336e-06 17.229 < 2e-16 ***
## Distinct_Sellers 2.007e-05 3.113e-06 6.449 6.57e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01628 on 230 degrees of freedom
## Multiple R-squared: 0.8115, Adjusted R-squared: 0.8074
## F-statistic: 198.1 on 5 and 230 DF, p-value: < 2.2e-16
# Diagnostic plots for the TRN model without price history.
plot(x = trn_fit_no_hist)
# Coefficients and fit statistics for the YOC model with price history.
yoc_fit_hist %>% summary() %>% print()
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers + Close_m1 + Close_m2 +
## Close_m3, data = yoc_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.052084 -0.001512 -0.000703 0.001339 0.044489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.171e-03 7.241e-04 1.617 0.106554
## Avg_Tok_Amt -2.921e-25 5.028e-25 -0.581 0.561621
## Tot_Tok_Amt -4.565e-30 4.740e-27 -0.001 0.999232
## Transactions -9.850e-07 9.762e-07 -1.009 0.313549
## Distinct_Buyers -1.245e-05 2.951e-05 -0.422 0.673353
## Distinct_Sellers 2.688e-06 5.304e-06 0.507 0.612531
## Close_m1 5.855e-01 5.028e-02 11.644 < 2e-16 ***
## Close_m2 2.244e-01 5.732e-02 3.915 0.000106 ***
## Close_m3 1.595e-01 5.007e-02 3.185 0.001558 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007628 on 410 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.9661, Adjusted R-squared: 0.9654
## F-statistic: 1461 on 8 and 410 DF, p-value: < 2.2e-16
# Diagnostic plots for the YOC model with price history.
plot(x = yoc_fit_hist)
# Coefficients and fit statistics for the YOC model without price history.
yoc_fit_no_hist %>% summary() %>% print()
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers, data = yoc_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.064999 -0.026200 -0.005130 0.007418 0.138106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.553e-02 3.040e-03 11.688 < 2e-16 ***
## Avg_Tok_Amt -5.981e-24 2.411e-24 -2.481 0.0135 *
## Tot_Tok_Amt -4.511e-26 2.278e-26 -1.981 0.0483 *
## Transactions -2.628e-05 4.527e-06 -5.806 1.27e-08 ***
## Distinct_Buyers 1.118e-03 1.278e-04 8.749 < 2e-16 ***
## Distinct_Sellers 4.069e-05 2.521e-05 1.614 0.1072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03683 on 416 degrees of freedom
## Multiple R-squared: 0.2544, Adjusted R-squared: 0.2454
## F-statistic: 28.39 on 5 and 416 DF, p-value: < 2.2e-16
# Diagnostic plots for the YOC model without price history.
plot(x = yoc_fit_no_hist)
We find that including the last three days of close prices really overpowers any gains we make via our engineered regressors. All three give us values over .95 for R^2 which is great! Unfortunately, this will not be able to predict quick spikes or drops in the price as it is simply going to estimate a linear trajectory based on the previous days’ action.
If we disregard the previous days’ close prices, we are able to get the following R^2 values after [manually] experimenting with K values representing the top K buyers. - OMG: 0.4418 (K=104) - TRN: ~0.8115 (K=~18,000) - YOC: 0.2544 (K=136)
Note that TRN’s K value which produced the highest R^2 Value was exceptionally high compared to OMG and YOC. We plan to explore why this was the case in our writeup.